In [ ]:
%run "../Functions/1. Google form analysis.ipynb"
In [ ]:
# Columns kept when narrowing the events table down for per-session processing.
perSessionRelevantColumns = ['sessionId', 'serverTime', 'section']
#reachEvents = rmdf152[rmdf152['type']=='reach'].loc[:,perSessionRelevantColumns]
#deathEvents = rmdf152[rmdf152['type']=='death'].loc[:,perSessionRelevantColumns]

# Tutorial checkpoints, in order: 'tutorial1.Checkpoint00' .. 'tutorial1.Checkpoint14'.
# Used as the row index of the per-checkpoint result tables.
timedSectionsIndex = [
    'tutorial1.Checkpoint{:02d}'.format(checkpointNumber)
    for checkpointNumber in range(15)
]

# Column layouts of the per-checkpoint result tables.
timedSectionsReachedColumns = ['firstReached', 'firstCompletionDuration']
timedSectionsDeathsColumns = ['deathsCount']
eventSectionsCountColumns = ['section', 'count']
eventSectionsColumns = ['count']
In [ ]:
## Comparison between game and Google form performance
In [ ]:
# Returns a given session's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsTimes( sessionId, _rmDF = rmdf152 ):
    """Build the per-checkpoint timing table of one session.

    Parameters:
        sessionId: value matched against the 'sessionId' column of _rmDF.
        _rmDF: events table; the 'type', 'sessionId', 'serverTime' and
            'section' columns are read.

    Returns:
        A DataFrame indexed by timedSectionsIndex with two columns:
        - 'firstReached': earliest 'serverTime' of a 'reach' event for that
          checkpoint, or pd.Timestamp(0, tz='utc') when it was never reached;
        - 'firstCompletionDuration': difference with the previous checkpoint's
          'firstReached', pd.Timedelta(0) for a reached first checkpoint, and
          pd.Timedelta.max when the duration cannot be computed.
    """
    neverReached = pd.Timestamp(0, tz='utc')
    firstCheckpoint = "tutorial1.Checkpoint00"

    # 'reach' events of this session that happened in a tutorial section
    reachEvents = _rmDF[_rmDF['type']=='reach'].loc[:,perSessionRelevantColumns]
    perSession = reachEvents[reachEvents['sessionId']==sessionId]
    perSession = perSession[perSession['section'].str.startswith('tutorial', na=False)]

    # default table: nothing reached
    timedSections = pd.DataFrame(data=0, columns=timedSectionsReachedColumns,index=timedSectionsIndex)
    timedSections['firstReached'] = neverReached
    timedSections['firstCompletionDuration'] = pd.Timedelta.max

    if(len(perSession) > 0):
        # Assigning a Series aligns on the checkpoint names: checkpoints with
        # no event become NaT, sections outside timedSectionsIndex are dropped.
        timedSections["firstReached"] = perSession.groupby("section")["serverTime"].min()
        timedSections["firstCompletionDuration"] = timedSections["firstReached"].diff()
        # At this point an unreached checkpoint holds NaT, and `NaT != x` is
        # always True, so the test has to be an explicit null check: only a
        # first checkpoint that was really reached gets a zero duration.
        if pd.notnull(timedSections.loc[firstCheckpoint, "firstReached"]):
            timedSections.loc[firstCheckpoint, "firstCompletionDuration"] = pd.Timedelta(0)

    # replace the missing values by the "never reached" markers
    timedSections["firstReached"] = timedSections["firstReached"].fillna(neverReached)
    timedSections["firstCompletionDuration"] = timedSections["firstCompletionDuration"].fillna(pd.Timedelta.max)
    return timedSections
In [ ]:
# Returns a given user's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsTimesUser( userId, _sessionsList = [], _rmDF = rmdf152 ):
    """Merge the checkpoint timing tables of all the sessions of a user.

    Parameters:
        userId: user whose sessions are looked up with getUserSessions when
            _sessionsList is empty.
        _sessionsList: session ids to merge; never mutated here. When empty,
            the sessions are fetched from _rmDF.
        _rmDF: events table, forwarded to getUserSessions and
            getCheckpointsTimes.

    Returns:
        A DataFrame indexed by timedSectionsIndex with the columns
        'firstReached' and 'firstCompletionDuration'. For each checkpoint the
        row of the session that reached it earliest is kept; checkpoints
        reached in no session keep pd.Timestamp(0, tz='utc') and
        pd.Timedelta.max.
    """
    neverReached = pd.Timestamp(0, tz='utc')

    # List of associated sessions
    if( len(_sessionsList) == 0):
        _sessionsList = getUserSessions(_rmDF, userId)

    _timedSections = pd.DataFrame(data=0, columns=timedSectionsReachedColumns,index=timedSectionsIndex)
    _timedSections["firstReached"] = neverReached
    _timedSections["firstCompletionDuration"] = pd.Timedelta.max

    for _sessionId in _sessionsList:
        # _rmDF must be forwarded, otherwise a caller-supplied table would
        # only be used to list the sessions and not to time them.
        _thisSessionTimes = getCheckpointsTimes( _sessionId, _rmDF = _rmDF )
        for _checkpointName in _thisSessionTimes.index:
            _sessionReached = _thisSessionTimes.loc[_checkpointName, 'firstReached']
            if _sessionReached == neverReached:
                # not reached during this session: nothing to merge
                continue
            _knownReached = _timedSections.loc[_checkpointName, 'firstReached']
            if (_knownReached == neverReached) or (_knownReached > _sessionReached):
                # first or earlier occurrence: take this session's values
                _timedSections.loc[_checkpointName, 'firstReached'] = _sessionReached
                _timedSections.loc[_checkpointName, 'firstCompletionDuration'] = \
                    _thisSessionTimes.loc[_checkpointName, 'firstCompletionDuration']
    return _timedSections
In [ ]:
def getPlayedTimeSessionMode(sessionEvents, mode):
    """Compute the days and the total time played in one game mode.

    Parameters:
        sessionEvents: events table; the 'section' and 'userTime' columns are
            read. 'userTime' is expected to hold datetimes.
        mode: prefix of the 'section' values to keep (the callers in this
            file pass 'tutorial' or 'sandbox').

    Returns:
        A dict with:
        - 'daysSpent': set of the calendar days (midnight timestamps, built
          from the raw 'userTime' values) that contain at least one event;
        - 'totalSpentTime': pd.Timedelta, sum over those days of the span
          between the first and the last event of the day.
    """
    inMode = sessionEvents['section'].str.startswith(mode, na=False)
    eventTimes = sessionEvents.loc[inMode, 'userTime']
    if len(eventTimes) == 0:
        return {'daysSpent': set(), 'totalSpentTime': pd.Timedelta(0)}

    # Group by the day each event falls on instead of resampling with
    # pd.TimeGrouper('D') (removed from pandas): a resampling grouper also
    # emits the event-less days between the first and the last event, which
    # would be counted as played days.
    timestamps = pd.Series(pd.DatetimeIndex(eventTimes.values))
    perDay = timestamps.groupby(timestamps.dt.floor('D')).agg(['min', 'max'])
    playedPerDay = perDay['max'] - perDay['min']
    return {'daysSpent': set(perDay.index), 'totalSpentTime': playedPerDay.sum()}
In [ ]:
# Returns a given session's total playtime and day count
def getPlayedTimeSession( sessionId, _rmDF = rmdf152 ):
    """Return the played time of one session, per game mode.

    The events of _rmDF whose 'sessionId' equals sessionId are handed to
    getPlayedTimeSessionMode once per mode. The result is a dict with the
    keys 'tutorial' and 'sandbox', each holding that function's result.
    """
    ofSession = _rmDF['sessionId'] == sessionId
    sessionEvents = _rmDF[ofSession]
    return {
        gameMode: getPlayedTimeSessionMode(sessionEvents, gameMode)
        for gameMode in ('tutorial', 'sandbox')
    }
In [ ]:
def mergePlayedTimes(a, b):
    """Combine two played-time dicts without modifying either of them.

    a and b map a game mode to a dict with the keys 'totalSpentTime' and
    'daysSpent'. For every mode of a, the times are added and the day sets
    are united; b must contain every mode of a.
    """
    merged = a.copy()
    for mode, playedA in a.items():
        playedB = b[mode]
        merged[mode] = {
            'totalSpentTime': playedA['totalSpentTime'] + playedB['totalSpentTime'],
            'daysSpent': playedA['daysSpent'] | playedB['daysSpent'],
        }
    return merged
In [ ]:
# Returns a given user's total playtime and day count
def getPlayedTimeUser( userId, _sessionsList = [], _rmDF = rmdf152 ):
    """Accumulate the played time of all the sessions of a user.

    When _sessionsList is empty the sessions are fetched with getUserSessions.
    The accumulator starts from the played time of the session id '' and each
    session's played time is merged into it with mergePlayedTimes.
    """
    sessions = _sessionsList
    if len(sessions) == 0:
        sessions = getUserSessions(_rmDF, userId)

    # starting value of the accumulation: played time of the session id ''
    total = getPlayedTimeSession('', _rmDF = _rmDF)
    for sessionId in sessions:
        total = mergePlayedTimes(total, getPlayedTimeSession(sessionId, _rmDF))
    return total
In [ ]:
# Returns a given session's checkpoints, and death count
def getDeaths( sessionId, _rmDF = rmdf152 ):
    """Count the 'death' events of a session per tutorial section.

    Returns a DataFrame with one row per section in which a death occurred,
    with the columns 'section' and 'deathsCount'.
    """
    isDeath = _rmDF['type'] == 'death'
    deaths = _rmDF.loc[isDeath, perSessionRelevantColumns]
    inSession = deaths[deaths['sessionId'] == sessionId]
    inTutorial = inSession['section'].str.startswith('tutorial', na=False)
    return inSession[inTutorial].groupby("section").size().reset_index(name='deathsCount')
In [ ]:
def getDeathsUser( userId, _rmDF = rmdf152 ):
    """Sum the per-checkpoint death counts over all the sessions of a user.

    Parameters:
        userId: user whose sessions are looked up with getUserSessions.
        _rmDF: events table, forwarded to getUserSessions and getDeaths.

    Returns:
        A DataFrame indexed by timedSectionsIndex with a single 'deathsCount'
        column, 0 where the user never died.

    Note: a death in a tutorial section that is not part of
    timedSectionsIndex raises a KeyError on the lookup, as before.
    """
    # List of associated sessions
    sessionsList = getUserSessions(_rmDF, userId)

    deathsSections = pd.DataFrame(0, columns=timedSectionsDeathsColumns,index=timedSectionsIndex)
    for sessionId in sessionsList:
        # forward _rmDF so that the deaths are counted in the same table the
        # sessions were listed from
        deaths = getDeaths( sessionId, _rmDF = _rmDF )
        for checkpointName, deathsCount in zip(deaths['section'], deaths['deathsCount']):
            # single .loc access: the chained form df[col][row] = value
            # writes to a temporary object and is not guaranteed to update
            # deathsSections
            deathsSections.loc[checkpointName, 'deathsCount'] += deathsCount
    return deathsSections
In [ ]:
# Static data
# craftEventsColumns = pd.DataFrame(
# index=list(range(4)),
# data={
# 'eventCode' : pd.Categorical(["equip","unequip","add","remove"]),
# 'eventType' : pd.Categorical(["add","remove","add","remove"]),
# 'column' : pd.Categorical(["customData.device","customData.device","customData.biobrick","customData.biobrick"]),
# }
#)
#craftEventsColumns
In [ ]:
# Static data
# Craft event codes, and for each of them the events table 'type' value and
# the column that must be non-null for an event to count for that code.
craftEventCodes = ["equip", "unequip", "add", "remove"]
craftEventsColumns = pd.DataFrame(
    {
        'eventType': pd.Categorical(["add", "remove", "add", "remove"]),
        'column': pd.Categorical([
            "customData.device",
            "customData.device",
            "customData.biobrick",
            "customData.biobrick",
        ]),
    },
    index=craftEventCodes,
)
In [ ]:
# Returns a given session's checkpoints, and event count
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getSectionsCraftEvents( eventCode, sessionId, _rmDF = rmdf152 ):
    """Count the craft events of one code, per tutorial section of a session.

    eventCode must be one of craftEventCodes; otherwise a message is printed
    and an empty table with the columns 'section' and 'count' is returned.
    For a valid code the result has one row per section in which a matching
    event occurred, with the columns 'section' and 'count'.
    """
    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return pd.DataFrame(0, columns=eventSectionsCountColumns, index=range(0))

    # an event matches when its type is the code's event type and the code's
    # data column is filled in
    rmType = craftEventsColumns['eventType'][eventCode]
    payloadColumn = craftEventsColumns['column'][eventCode]
    matching = _rmDF[_rmDF['type'] == rmType]
    matching = matching[matching[payloadColumn].notnull()]

    inSession = matching.loc[matching['sessionId'] == sessionId, perSessionRelevantColumns]
    inTutorial = inSession['section'].str.startswith('tutorial', na=False)
    return inSession[inTutorial].groupby("section").size().reset_index(name='count')
In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEvents( eventCode, userId, sessionsList = [], _rmDF = rmdf152 ):
    """Sum the per-checkpoint craft event counts over the sessions of a user.

    Parameters:
        eventCode: one of craftEventCodes; otherwise a message is printed and
            the all-zero table is returned.
        userId: user whose sessions are looked up when sessionsList is empty.
        sessionsList: session ids to process; never mutated here.
        _rmDF: events table, forwarded to getUserSessions and
            getSectionsCraftEvents.

    Returns:
        A DataFrame indexed by timedSectionsIndex with a single 'count'
        column.

    Note: an event in a tutorial section that is not part of
    timedSectionsIndex raises a KeyError on the lookup, as before.
    """
    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)
    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return userSectionsEvents

    # List of associated sessions
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    for sessionId in sessionsList:
        # forward _rmDF so that the events are counted in the same table the
        # sessions were listed from
        sessionSectionsEvents = getSectionsCraftEvents( eventCode, sessionId, _rmDF = _rmDF )
        for checkpointName, count in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
            # single .loc access instead of the chained df[col][row] = value,
            # which is not guaranteed to write into userSectionsEvents
            userSectionsEvents.loc[checkpointName, 'count'] += count
    return userSectionsEvents
In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEventsTotal( eventCode, userId, sessionsList = [] ):
    """Return the total of the per-checkpoint counts of getUserSectionsCraftEvents."""
    perSection = getUserSectionsCraftEvents( eventCode, userId, sessionsList )
    total = perSection.values.sum()
    return total
In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserCraftEventsTotal( eventCode, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Count the craft events of one code over all the sessions of a user.

    Unlike the per-section variant, events of every section are counted.
    An eventCode outside craftEventCodes prints a message and yields 0.
    """
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return 0

    rmType = craftEventsColumns['eventType'][eventCode]
    payloadColumn = craftEventsColumns['column'][eventCode]
    matching = _rmDF[_rmDF['type'] == rmType]
    matching = matching[matching[payloadColumn].notnull()]
    ofUser = matching[matching['sessionId'].isin(sessionsList)]
    return len(ofUser)
In [ ]:
# Returns a given session's checkpoints, and event count
def getSectionsEvents( eventType, sessionId, _rmDF = rmdf152 ):
    """Count the events of one type, per tutorial section of a session.

    Returns a DataFrame with one row per section in which such an event
    occurred, with the columns 'section' and 'count'.
    """
    ofType = _rmDF['type'] == eventType
    typedEvents = _rmDF.loc[ofType, perSessionRelevantColumns]
    inSession = typedEvents[typedEvents['sessionId'] == sessionId]
    inTutorial = inSession['section'].str.startswith('tutorial', na=False)
    return inSession[inTutorial].groupby("section").size().reset_index(name='count')
In [ ]:
def getUserSectionsEvents( eventType, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Sum the per-checkpoint counts of one event type over a user's sessions.

    Parameters:
        eventType: value matched against the 'type' column of the events.
        userId: user whose sessions are looked up when sessionsList is empty.
        sessionsList: session ids to process; never mutated here.
        _rmDF: events table, forwarded to getUserSessions and
            getSectionsEvents.

    Returns:
        A DataFrame indexed by timedSectionsIndex with a single 'count'
        column.

    Note: an event in a tutorial section that is not part of
    timedSectionsIndex raises a KeyError on the lookup, as before.
    """
    # List of associated sessions
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)
    for sessionId in sessionsList:
        # forward _rmDF so that the events are counted in the same table the
        # sessions were listed from
        sessionSectionsEvents = getSectionsEvents( eventType, sessionId, _rmDF = _rmDF )
        for checkpointName, count in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
            # single .loc access instead of the chained df[col][row] = value,
            # which is not guaranteed to write into userSectionsEvents
            userSectionsEvents.loc[checkpointName, 'count'] += count
    return userSectionsEvents
In [ ]:
def getUserSectionsEventsTotal( eventType, userId, sessionsList=[] ):
    """Return the total of the per-checkpoint counts of getUserSectionsEvents."""
    perSection = getUserSectionsEvents( eventType, userId, sessionsList )
    total = perSection.values.sum()
    return total
In [ ]:
def getUserEventsTotal( eventType, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Count the events of one type over all the sessions of a user.

    Events of every section are counted. When sessionsList is empty the
    sessions are fetched with getUserSessions.
    """
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)
    wanted = (_rmDF['type'] == eventType) & _rmDF['sessionId'].isin(sessionsList)
    return len(_rmDF[wanted])
In [ ]:
# Returns a given user's unique reached checkpoints
def getUserCheckpoints( userId, _rmDF = rmdf152 ):
    """List the distinct tutorial sections reached by a user.

    The 'reach' events of all the user's sessions are scanned; the result is
    a Series of the distinct 'section' values starting with 'tutorial', in
    order of first appearance.
    """
    # List of associated sessions
    sessionsList = getUserSessions(_rmDF, userId)

    # 'reach' events of those sessions
    reached = _rmDF.loc[_rmDF['type'] == 'reach', perSessionRelevantColumns]
    reached = reached[reached['sessionId'].isin(sessionsList)]

    sections = reached['section']
    tutorialSections = sections[sections.str.startswith('tutorial', na=False)]
    return pd.Series(tutorialSections.unique())
def getDiscrepancyGameGForm( userId ):
    """Compare the checkpoints validated in the survey with those reached in game.

    Parameters:
        userId: user passed to getNonValidatedCheckpoints,
            getValidatedCheckpoints and getUserCheckpoints.

    Returns:
        A tuple (gameNotEnough, gformNotEnough) of Series:
        - gameNotEnough: sorted, unique checkpoints validated in the survey
          that were not reached in the game;
        - gformNotEnough: non-validated survey checkpoints that compare
          greater than or equal to the highest checkpoint reached in the game
          (all of them when no checkpoint was reached).
    """
    gformNonVal = getNonValidatedCheckpoints(userId)
    gformVal = getValidatedCheckpoints(userId)
    gameVal = getUserCheckpoints(userId)

    #user has answered questions whose answer they haven't seen in the game
    #np.setdiff1d returns the sorted, unique values of the first array that are not in the second
    gameNotEnough = pd.Series(np.setdiff1d(gformVal.values, gameVal.values))

    #user has not answered questions whose answer they have seen in the game
    maxGameVal = ''
    if gameVal.values.size!=0:
        # The maximum used to be computed and thrown away, which left
        # maxGameVal at '' and made the filter below accept everything.
        maxGameVal = gameVal.values.max()
    # NOTE(review): the comment above suggests keeping the checkpoints the
    # player HAS seen, which would be `nonVal <= maxGameVal`; the original
    # `>=` is kept here -- confirm the intended direction.
    gformNotEnough = pd.Series([nonVal for nonVal in gformNonVal.values if nonVal >= maxGameVal])
    return (gameNotEnough, gformNotEnough)
In [ ]:
# Static data
noSectionEventCodes = [
    'start',
    'selectmenu',
    'switch',
    'restart',
    'gotourl',
    'gotomooc',
    'configure',
]
In [ ]:
# Event types that get one row each in the user data vector.
# possible events: complete configure craft death equip gotomooc gotourl pickup reach restart selectmenu start switch unequip
simpleEvents = [
    'complete', 'configure', 'craft', 'death',
    'equip', 'unequip', 'add', 'remove',
    'gotomooc', 'gotourl', 'pickup', 'reach',
    'restart', 'selectmenu', 'start', 'switch',
]

# Row labels of the user data vector: the session count, one score row per
# answer temporality, then one row per simple event.
userDataVectorIndex = ['sessionsCount']  #game
for temporality in answerTemporalities:
    userDataVectorIndex += [scoreLabel + temporality]
userDataVectorIndex = np.concatenate((userDataVectorIndex, simpleEvents))
In [ ]:
#allEvents = rmdf152['type'].unique()
#allEvents = np.concatenate( simpleEvents, allEvents ).unique()
#allUserDataVectorIndex = np.concatenate( userDataVectorIndex, allEvents ).unique()
In [ ]:
# userId is RedMetrics user id
# _source is used as correction source, if we want to include answers to these questions
def getUserDataVector( userId, _source = correctAnswers, _rmDF = rmdf152 ):
    """Build the one-column feature table describing a user.

    Parameters:
        userId: RedMetrics user id; its string form names the result column.
        _source: correction source handed to getBinarized; when it is empty
            the survey answers are not added.
        _rmDF: events table, forwarded to every helper that accepts it.

    Returns:
        A DataFrame with a single column str(userId). Its rows are those of
        userDataVectorIndex (session count, one score per temporality, one
        count per simple event) followed by 'maxChapter', 'efficiency',
        'thoroughness', 'fun', 'completionTime', one row per chapter number
        holding its first completion duration in seconds, and, when
        available, one row per survey question.
    """
    sessionsList = getUserSessions(_rmDF, userId)

    columnName = str(userId)
    # Float from the start: scores, NaN and the efficiency/fun values are
    # floats, and relying on an implicit int -> float upcast when assigning
    # them is deprecated in recent pandas.
    data = pd.DataFrame(0.0, columns=[columnName],index=userDataVectorIndex)

    # survey scores: last 'before' score, first score otherwise
    score = getScore( userId )
    for _temporality in score.columns:
        _score = score.loc[scoreLabel,_temporality]
        if(len(_score)>0):
            if(_temporality == 'before'):
                _score = _score[len(_score)-1]
            else:
                _score = _score[0]
        else:
            _score = np.nan
        data.loc[scoreLabel+_temporality,columnName] = _score

    data.loc['sessionsCount',columnName] = len(sessionsList)

    # event counts; _rmDF is forwarded so that a caller-supplied table is
    # used consistently instead of the helpers' own default
    for eventName in simpleEvents:
        if eventName in craftEventCodes:
            data.loc[eventName,columnName] = getUserCraftEventsTotal(eventName, userId, sessionsList, _rmDF = _rmDF)
        else:
            data.loc[eventName,columnName] = getUserEventsTotal(eventName, userId, sessionsList, _rmDF = _rmDF)

    # highest checkpoint reached, as a number; Checkpoint00 is the floor.
    # Built-in max replaces Series.append, which no longer exists in pandas 2.
    reachedCheckpoints = ['tutorial1.Checkpoint00'] + list(getUserCheckpoints(userId, _rmDF = _rmDF))
    data.loc['maxChapter', columnName] = int(max(reachedCheckpoints)[-2:])

    # time spent on each chapter
    # (getCheckpointsTimesUser initialises unreached chapters to
    # pd.Timedelta.max, which is summed into completionTime like any other)
    times = getCheckpointsTimesUser(userId, _sessionsList = sessionsList, _rmDF = _rmDF)
    completionTime = 0
    chapterTime = {}
    for chapter in timedSectionsIndex:
        deltaTime = times.loc[chapter,"firstCompletionDuration"].total_seconds()
        chapterTime[int(chapter[-2:])] = deltaTime
        completionTime += deltaTime

    # efficiency = log((1 + #unlockedchapters)/(time * (1 + #death + #craft + #add + #equip)))
    data.loc['efficiency', columnName] = np.log(( 1 + data.loc['maxChapter', columnName] ) / \
                                          (completionTime \
                                           * ( 1\
                                              + data.loc['death', columnName] \
                                              + data.loc['craft', columnName]\
                                              + data.loc['add', columnName]\
                                              + data.loc['equip', columnName]\
                                             )\
                                          ))

    playedTime = getPlayedTimeUser(userId, _sessionsList = sessionsList, _rmDF = _rmDF)

    data.loc['thoroughness', columnName] = \
        data.loc['craft', columnName]\
        * data.loc['pickup', columnName]\
        * ( 1 + np.power(len(playedTime['sandbox']['daysSpent']),2))

    totalSpentTime = playedTime['tutorial']['totalSpentTime'] + playedTime['sandbox']['totalSpentTime']
    totalSpentDays = len(playedTime['tutorial']['daysSpent'] | playedTime['sandbox']['daysSpent'])
    data.loc['fun', columnName] = np.log(\
                                         max(1,\
                                             totalSpentTime.total_seconds()
                                             * np.power(totalSpentDays,2)
                                         ))

    data.loc['completionTime', columnName] = completionTime
    # one row per chapter number
    for chapterNumber, deltaTime in chapterTime.items():
        data.loc[chapterNumber,columnName] = deltaTime

    if(len(_source) != 0):
        if(hasAnswered(userId)):
            # pick one survey answer: the first 'after' one, else the last
            # 'before' one, else the last answer of any temporality
            gformLine = gform[gform[localplayerguidkey] == userId]
            afters = gformLine[gformLine['Temporality'] == 'after']
            if(len(afters) > 0):
                gformLine = afters.iloc[0]
            else:
                befores = gformLine[gformLine['Temporality'] == 'before']
                if(len(befores) > 0):
                    gformLine = befores.iloc[len(befores)-1]
                else:
                    gformLine = gformLine.iloc[len(gformLine)-1]

            # add data from the gform: binary score on each question
            gformData = getBinarized(gformLine, _source = _source)
            for question in gformData.index:
                data.loc[question,columnName] = gformData.loc[question]
        else:
            # str(): userId is not necessarily a string
            print("warning: user " + str(userId) + " has never answered the survey")

    return data
In [ ]:
# for per-session, manual analysis
def getSessionDataPreview( _sessionId, _rmDF = rmdf152 ):
    """Summarise one session for manual inspection.

    Returns a dict with:
    - 'first' / 'last': smallest and largest position of the sorted
      'userTime' values of the session;
    - 'platform': first non-null 'customData.platform' value, '' if none;
    - 'events': number of events per 'type'.
    """
    _sessionLogs = _rmDF[_rmDF['sessionId'] == _sessionId]
    _chronological = _sessionLogs['userTime'].sort_values()

    _platforms = _sessionLogs['customData.platform'].dropna().values
    _platform = _platforms[0] if len(_platforms) > 0 else ''

    _eventCounts = _sessionLogs['type'].value_counts()

    _preview = {}
    _preview['first'] = _chronological.iloc[0]
    _preview['last'] = _chronological.iloc[-1]
    _preview['platform'] = _platform
    _preview['events'] = _eventCounts
    return _preview
In [ ]:
# for per-user, manual analysis
def getUserDataPreview(userId, _rmDF = rmdf152):
    """Summarise the game logs and survey answers of a user for manual inspection.

    Returns a DataFrame with a single column named after userId; each row is
    one labelled piece of information (session previews first, then survey
    scores and per-answer details).
    """
    # Planned content (unchecked items are not all implemented):
    # [ ] RM
    #   [ ] sessions count
    #   [ ] first event date
    #   [ ] time played
    #   [ ] dates played
    #   [ ] first played, last played
    #   [ ] best chapter
    #   [ ] counts of events: deaths, crafts,...
    #   [ ] gaming platform
    # [ ] GF
    #   [ ] score(s)
    #   [ ] progression
    #   [ ] temporality
    #   [ ] temporality according to answers
    #   [ ] #before
    #   [ ] #after
    #   [ ] demographics

    result = pd.DataFrame(columns = [userId])

    # ---- game logs ----
    result.loc['REDMETRICS ANALYSIS'] = ' '

    sessions = getUserSessions(_rmDF, userId)
    result.loc['sessions', userId] = len(sessions)
    result.loc['firstEvent', userId] = getFirstEventDate( userId )

    # one group of rows per session, numbered in listing order
    for _position, _sessionId in enumerate(sessions['sessionId']):
        sdp = getSessionDataPreview(_sessionId, _rmDF = _rmDF)
        _prefix = 'session' + str(_position)
        for _field in ('platform', 'first', 'last'):
            result.loc[_prefix + ' ' + _field, userId] = sdp[_field]
        result.loc[_prefix + ' events', userId] = str(sdp['events'])

    # ---- survey ----
    result.loc['GFORM ANALYSIS'] = ' '

    # one score per temporality: last 'before' score, first score otherwise
    score = getScore( userId )
    for _temporality in score.columns:
        _scores = score.loc[scoreLabel,_temporality]
        if len(_scores) == 0:
            _score = np.nan
        elif _temporality == 'before':
            _score = _scores[len(_scores)-1]
        else:
            _score = _scores[0]
        result.loc[scoreLabel+_temporality,userId] = _score

    result.loc[scoreLabel+'s',userId] = str(score.values)

    # details of each survey answer
    gfDataPreview = getGFormDataPreview(userId, gform)
    features = ('date', 'temporality RM', 'temporality GF', 'score', 'genderAge')
    for key in gfDataPreview:
        _answer = gfDataPreview[key]
        for _feature in features:
            result.loc[key + ' ' + _feature] = str(_answer[_feature])
        for _matchNumber, match in enumerate(_answer['demographic matches']):
            result.loc[key + ' demographic match ' + str(_matchNumber)] = repr(match)

    return result